Doublechecking net values

Some people have been reporting problems in the value of net_value column in the CEAP datasets. It doesn't seem to really match what it should contain.

In [1]:
import pandas as pd
import numpy as np

filenames = ['../data/2016-08-08-current-year.xz',
             '../data/2016-08-08-last-year.xz',
             '../data/2016-08-08-previous-years.xz']
dataset = pd.DataFrame()

for filename in filenames:
    data = pd.read_csv(filename,
                       parse_dates=[16],
                       dtype={'document_id': np.str,
                              'congressperson_id': np.str,
                              'congressperson_document': np.str,
                              'term_id': np.str,
                              'cnpj_cpf': np.str,
                              'reimbursement_number': np.str})
    dataset = pd.concat([dataset, data])
In [2]:
len(dataset)
Out[2]:
2072729
In [3]:
dataset['issue_date'] = pd.to_datetime(dataset['issue_date'], errors='coerce')
In [4]:
(dataset['document_value'].isnull()).sum()
Out[4]:
17
In [5]:
dataset[dataset['document_value'].isnull()]
Out[5]:
document_id congressperson_name congressperson_id congressperson_document term state party term_id subquota_number subquota_description ... net_value month year installment passenger leg_of_the_trip batch_number reimbursement_number reimbursement_value applicant_id
72863 5021378 ALMEIDA LIMA 160626 173 2011.0 SE PMDB 54 5 Publicity of parliamentary activity ... 10000.00 10 2012 0 NaN NaN 1005279 4159 NaN 2370
72864 5021389 ALMEIDA LIMA 160626 173 2011.0 SE PMDB 54 5 Publicity of parliamentary activity ... 11300.00 10 2012 0 NaN NaN 1005279 4159 NaN 2370
243933 5028079 CARLOS ZARATTINI 141398 398 2015.0 SP PT 55 10 Telecommunication ... 29.90 2 2013 0 NaN NaN 1007122 4189 NaN 1963
286036 5028084 DANIEL ALMEIDA 74060 188 2015.0 BA PCdoB 55 3 Fuels and lubricants ... 3113.38 1 2013 0 NaN NaN 1006976 4171 NaN 1562
422966 5062354 ELCIONE BARBALHO 74075 21 2015.0 PA PMDB 55 13 Congressperson meal ... 26.87 3 2013 0 NaN NaN 1016826 4238 NaN 1011
513864 5073020 FRANCISCO ESCÓRCIO 137983 567 2011.0 MA PMDB 54 3 Fuels and lubricants ... 150.00 3 2013 0 NaN NaN 1020110 4250 NaN 1762
652869 5063133 JANETE ROCHA PIETÁ 141455 358 2011.0 SP PT 54 15 Aircraft renting or charter of aircraft ... 15.00 3 2013 0 NaN NaN 1018289 4248 NaN 1852
670406 5020275 JESUS RODRIGUES 160671 115 2011.0 PI PT 54 4 Consultancy, research and technical work ... 7000.00 2 2013 0 NaN NaN 1004983 4185 NaN 2380
1031812 5069601 MISSIONÁRIO JOSÉ OLIMPIO 160561 375 2015.0 SP DEM 55 3 Fuels and lubricants ... 130.01 2 2013 0 NaN NaN 1018899 4247 NaN 2388
1095064 5009532 ONYX LORENZONI 74399 510 2015.0 RS DEM 55 3 Fuels and lubricants ... 2755.87 1 2013 0 NaN NaN 1002170 4139 NaN 1627
1118504 5030571 PADRE JOÃO 160556 259 2015.0 MG PT 55 3 Fuels and lubricants ... 137.20 2 2013 0 NaN NaN 1009721 4200 NaN 2305
1180554 5016649 PEDRO HENRY 74111 403 2011.0 MT PP 54 9 Flight tickets ... 486.95 1 2013 0 NaN Cuiabá/Brasilia 1003807 4187 NaN 1227
1212495 5020247 RAIMUNDO GOMES DE MATOS 74216 102 2015.0 CE PSDB 55 1 Participation in course, talk or similar event ... 223.30 1 2013 0 NaN NaN 1005054 4187 NaN 1244
1213738 5020251 RAIMUNDO GOMES DE MATOS 74216 102 2015.0 CE PSDB 55 8 Security service provided by specialized company ... 140.00 1 2013 0 NaN NaN 1005054 4187 NaN 1244
1214309 5020523 RAIMUNDO GOMES DE MATOS 74216 102 2015.0 CE PSDB 55 10 Telecommunication ... 181.84 1 2013 0 NaN NaN 1005053 4187 NaN 1244
1214331 5020531 RAIMUNDO GOMES DE MATOS 74216 102 2015.0 CE PSDB 55 10 Telecommunication ... 155.87 1 2013 0 NaN NaN 1005053 4187 NaN 1244
1214332 5020430 RAIMUNDO GOMES DE MATOS 74216 102 2015.0 CE PSDB 55 10 Telecommunication ... 4.70 1 2013 0 NaN NaN 1005053 4187 NaN 1244

17 rows × 29 columns

In [6]:
dataset[dataset['document_value'].isnull()].iloc[0]
Out[6]:
document_id                                               5021378
congressperson_name                                  ALMEIDA LIMA
congressperson_id                                          160626
congressperson_document                                       173
term                                                         2011
state                                                          SE
party                                                        PMDB
term_id                                                        54
subquota_number                                                 5
subquota_description          Publicity of parliamentary activity
subquota_group_id                                               0
subquota_group_description                                    NaN
supplier                                        JULIANO AMADEU ME
cnpj_cpf                                           11901432000190
document_number                                          00000007
document_type                                                   0
issue_date                                    2012-08-20 00:00:00
document_value                                                NaN
remark_value                                                 5000
net_value                                                   10000
month                                                          10
year                                                         2012
installment                                                     0
passenger                                                     NaN
leg_of_the_trip                                               NaN
batch_number                                              1005279
reimbursement_number                                         4159
reimbursement_value                                           NaN
applicant_id                                                 2370
Name: 72863, dtype: object

Since we expect precision in our net_value calculation, I'm going to use integers and not floats.

In [7]:
import math

dataset = dataset.dropna(subset=['document_value'])
dataset['document_value_int'] = (dataset['document_value'] * 100.).apply(math.ceil).astype(np.int)
dataset['remark_value_int'] = (dataset['remark_value'] * 100.).apply(math.ceil).astype(np.int)
dataset['net_value_int'] = (dataset['net_value'] * 100.).apply(math.ceil).astype(np.int)
dataset['calc_net_value_int'] = dataset['document_value_int'] - dataset['remark_value_int']
In [8]:
((dataset['calc_net_value_int'] - dataset['net_value_int']) != 0).sum()
Out[8]:
21434
In [9]:
dataset.iloc[0]
Out[9]:
document_id                                                             5928744
congressperson_name                                           ABEL MESQUITA JR.
congressperson_id                                                        178957
congressperson_document                                                       1
term                                                                       2015
state                                                                        RR
party                                                                       DEM
term_id                                                                      55
subquota_number                                                               1
subquota_description          Maintenance of office supporting parliamentary...
subquota_group_id                                                             0
subquota_group_description                                                  NaN
supplier                                COMPANHIA DE AGUAS E ESGOTOS DE RORAIMA
cnpj_cpf                                                         05939467000115
document_number                                                          146439
document_type                                                                 0
issue_date                                                  2016-02-15 00:00:00
document_value                                                            37.37
remark_value                                                                  6
net_value                                                                 31.37
month                                                                         1
year                                                                       2016
installment                                                                   0
passenger                                                                   NaN
leg_of_the_trip                                                             NaN
batch_number                                                            1268870
reimbursement_number                                                       5369
reimbursement_value                                                         NaN
applicant_id                                                               3074
document_value_int                                                         3737
remark_value_int                                                            600
net_value_int                                                              3137
calc_net_value_int                                                         3137
Name: 0, dtype: object
In [10]:
dataset['diff_net_value'] = dataset['net_value_int'] - dataset['calc_net_value_int']
dataset.loc[dataset['diff_net_value'] != 0, 'diff_net_value'].describe()
Out[10]:
count    2.143400e+04
mean    -6.495887e+04
std      3.402186e+05
min     -9.712858e+06
25%     -1.086200e+04
50%      1.000000e+00
75%      1.000000e+00
max      1.763608e+06
Name: diff_net_value, dtype: float64

What's the number of records with distinct values of net_value and our own net_value, considering acceptable a maximum difference of 2 cents?

In [11]:
with_significant_difference = dataset.loc[dataset['diff_net_value'].abs() > 2]
In [12]:
with_significant_difference['subquota_description'].describe()
Out[12]:
count                    9457
unique                     17
top       Flight ticket issue
freq                     4319
Name: subquota_description, dtype: object

It's not just a single subquota, but almost all of them. Probably means that there's something intrinsic in the dataset we still don't know.

In [13]:
print(len(dataset['subquota_description'].unique()))
print(len(with_significant_difference['subquota_description'].unique()))
18
17
In [14]:
from altair import *

Chart(with_significant_difference).mark_bar().encode(
    x=X('subquota_description:O',
        sort=SortField(field='subquota_description',
                       order='descending',
                       op='count')),
    y='count(*):Q',
)
In [15]:
with_significant_difference['subquota_description'].unique()
Out[15]:
array(['Security service provided by specialized company',
       'Fuels and lubricants', 'Publicity of parliamentary activity',
       'Automotive vehicle renting or charter',
       'Maintenance of office supporting parliamentary activity',
       'Telecommunication', 'Congressperson meal',
       'Taxi, toll and parking',
       'Consultancy, research and technical work',
       'Lodging, except for congressperson from Distrito Federal',
       'Flight tickets', 'Publication subscriptions',
       'Flight ticket issue', 'Aircraft renting or charter of aircraft',
       'Participation in course, talk or similar event', 'Postal services',
       'Watercraft renting or charter'], dtype=object)

I want to perform a further investigation in flight ticket issues, since it's already documented that net_value's may be negative given canceled flights.

In [16]:
flight_ticket_issues = with_significant_difference['subquota_description'] == 'Flight ticket issue'
with_significant_difference.loc[flight_ticket_issues, 'net_value'].head(10)
Out[16]:
37628     1870.16
49896      815.26
49922      839.32
62640     -480.58
62641      961.16
65229       58.00
77422      495.16
78867     1329.26
101579     260.00
116316     298.89
Name: net_value, dtype: float64

Here we have 3 documents with reimbursement_number equal to zero. How about disconsidering them to start? If there's no reimbursement_number, we could assume that no reimbursement happened (if the data says data, the Chamber of Deputies should be contacted to confirm the affirmation).

In [17]:
with_significant_difference.loc[flight_ticket_issues & \
                                (with_significant_difference['congressperson_id'] == '178983')]
Out[17]:
document_id congressperson_name congressperson_id congressperson_document term state party term_id subquota_number subquota_description ... leg_of_the_trip batch_number reimbursement_number reimbursement_value applicant_id document_value_int remark_value_int net_value_int calc_net_value_int diff_net_value
116316 NaN MARCIO ALVINO 178983 370 2015.0 SP PR 55 999 Flight ticket issue ... BSB/GRU 0 0 NaN 3061 9963 0 29889 9963 19926
116333 NaN MARCIO ALVINO 178983 370 2015.0 SP PR 55 999 Flight ticket issue ... BSB/GRU 0 0 NaN 3061 9963 0 29889 9963 19926
116396 NaN MARCIO ALVINO 178983 370 2015.0 SP PR 55 999 Flight ticket issue ... BSB/GRU 0 0 NaN 3061 17063 0 34126 17063 17063

3 rows × 34 columns

In [18]:
dataset['reimbursement_number'] = \
    dataset['reimbursement_number'].replace('0', None)
dataset = dataset.dropna(subset=['reimbursement_number'])
In [19]:
dataset['document_id'].isnull().sum()
Out[19]:
972391
In [20]:
with_significant_difference = dataset.loc[dataset['diff_net_value'].abs() > 2]
len(with_significant_difference)
Out[20]:
9307
In [21]:
flight_ticket_issues = with_significant_difference['subquota_description'] == 'Flight ticket issue'
with_significant_difference.loc[flight_ticket_issues].head()
Out[21]:
document_id congressperson_name congressperson_id congressperson_document term state party term_id subquota_number subquota_description ... leg_of_the_trip batch_number reimbursement_number reimbursement_value applicant_id document_value_int remark_value_int net_value_int calc_net_value_int diff_net_value
37628 NaN CLARISSA GAROTINHO 178939 294 2015.0 RJ PR 55 999 Flight ticket issue ... SDU/BSB 0 5506 NaN 3036 93508 0 187016 93508 93508
49896 NaN DULCE MIRANDA 178994 61 2015.0 TO PMDB 55 999 Flight ticket issue ... BSB/PMW 0 5440 NaN 2905 40763 0 81526 40763 40763
49922 NaN DULCE MIRANDA 178994 61 2015.0 TO PMDB 55 999 Flight ticket issue ... PMW/BSB 0 5440 NaN 2905 41966 0 83932 41966 41966
62640 NaN FÉLIX MENDONÇA JÚNIOR 160666 195 2015.0 BA PDT 55 999 Flight ticket issue ... SSA/BSB 0 5463 NaN 2307 -24029 0 -48058 -24029 -24029
62641 NaN FÉLIX MENDONÇA JÚNIOR 160666 195 2015.0 BA PDT 55 999 Flight ticket issue ... SSA/BSB 0 5463 NaN 2307 48058 0 96116 48058 48058

5 rows × 34 columns

In [22]:
with_significant_difference.loc[flight_ticket_issues].iloc[0]
Out[22]:
document_id                                   NaN
congressperson_name            CLARISSA GAROTINHO
congressperson_id                          178939
congressperson_document                       294
term                                         2015
state                                          RJ
party                                          PR
term_id                                        55
subquota_number                               999
subquota_description          Flight ticket issue
subquota_group_id                               0
subquota_group_description                    NaN
supplier                          Cia Aérea - GOL
cnpj_cpf                           07575651000159
document_number                   Bilhete: CGB8TZ
document_type                                   0
issue_date                    2016-04-07 00:00:00
document_value                             935.08
remark_value                                    0
net_value                                 1870.16
month                                           4
year                                         2016
installment                                     0
passenger                      CLARISSA GAROTINHO
leg_of_the_trip                           SDU/BSB
batch_number                                    0
reimbursement_number                         5506
reimbursement_value                           NaN
applicant_id                                 3036
document_value_int                          93508
remark_value_int                                0
net_value_int                              187016
calc_net_value_int                          93508
diff_net_value                              93508
Name: 37628, dtype: object
In [23]:
dataset[(dataset['congressperson_name'] == 'CLARISSA GAROTINHO') & \
        (dataset['month'] == 4) & \
        (dataset['year'] == 2016) & \
        (dataset['subquota_description'] == 'Flight ticket issue')]['net_value_int'].sum()
Out[23]:
794619
In [24]:
dataset.loc[dataset['document_number'] == 'Bilhete: VHVK6G']
Out[24]:
document_id congressperson_name congressperson_id congressperson_document term state party term_id subquota_number subquota_description ... leg_of_the_trip batch_number reimbursement_number reimbursement_value applicant_id document_value_int remark_value_int net_value_int calc_net_value_int diff_net_value
37661 NaN CLARISSA GAROTINHO 178939 294 2015.0 RJ PR 55 999 Flight ticket issue ... BSB/SDU 0 5506 NaN 3036 73813 0 73813 73813 0
37662 NaN CLARISSA GAROTINHO 178939 294 2015.0 RJ PR 55 999 Flight ticket issue ... BSB/SDU 0 5506 NaN 3036 73813 0 73813 73813 0
37663 NaN CLARISSA GAROTINHO 178939 294 2015.0 RJ PR 55 999 Flight ticket issue ... BSB/BSB 0 5506 NaN 3036 -51363 0 -51363 -51363 0

3 rows × 34 columns

In [25]:
dataset.loc[dataset['document_number'] == 'Bilhete: VHVK6G'].iloc[0]
Out[25]:
document_id                                   NaN
congressperson_name            CLARISSA GAROTINHO
congressperson_id                          178939
congressperson_document                       294
term                                         2015
state                                          RJ
party                                          PR
term_id                                        55
subquota_number                               999
subquota_description          Flight ticket issue
subquota_group_id                               0
subquota_group_description                    NaN
supplier                          Cia Aérea - GOL
cnpj_cpf                           07575651000159
document_number                   Bilhete: VHVK6G
document_type                                   0
issue_date                    2016-04-13 00:00:00
document_value                             738.13
remark_value                                    0
net_value                                  738.13
month                                           4
year                                         2016
installment                                     0
passenger                      CLARISSA GAROTINHO
leg_of_the_trip                           BSB/SDU
batch_number                                    0
reimbursement_number                         5506
reimbursement_value                           NaN
applicant_id                                 3036
document_value_int                          73813
remark_value_int                                0
net_value_int                               73813
calc_net_value_int                          73813
diff_net_value                                  0
Name: 37661, dtype: object
In [26]:
dataset.loc[dataset['document_number'] == 'Bilhete: VHVK6G'].iloc[1]
Out[26]:
document_id                                   NaN
congressperson_name            CLARISSA GAROTINHO
congressperson_id                          178939
congressperson_document                       294
term                                         2015
state                                          RJ
party                                          PR
term_id                                        55
subquota_number                               999
subquota_description          Flight ticket issue
subquota_group_id                               0
subquota_group_description                    NaN
supplier                          Cia Aérea - GOL
cnpj_cpf                           07575651000159
document_number                   Bilhete: VHVK6G
document_type                                   0
issue_date                    2016-04-13 00:00:00
document_value                             738.13
remark_value                                    0
net_value                                  738.13
month                                           4
year                                         2016
installment                                     0
passenger                     SAMPAIO/ANACAROLINA
leg_of_the_trip                           BSB/SDU
batch_number                                    0
reimbursement_number                         5506
reimbursement_value                           NaN
applicant_id                                 3036
document_value_int                          73813
remark_value_int                                0
net_value_int                               73813
calc_net_value_int                          73813
diff_net_value                                  0
Name: 37662, dtype: object
In [27]:
dataset.loc[dataset['document_number'] == 'Bilhete: VHVK6G'].iloc[2]
Out[27]:
document_id                                   NaN
congressperson_name            CLARISSA GAROTINHO
congressperson_id                          178939
congressperson_document                       294
term                                         2015
state                                          RJ
party                                          PR
term_id                                        55
subquota_number                               999
subquota_description          Flight ticket issue
subquota_group_id                               0
subquota_group_description                    NaN
supplier                          Cia Aérea - GOL
cnpj_cpf                           07575651000159
document_number                   Bilhete: VHVK6G
document_type                                   0
issue_date                    2016-04-19 00:00:00
document_value                            -513.63
remark_value                                    0
net_value                                 -513.63
month                                           4
year                                         2016
installment                                     0
passenger                      CLARISSA GAROTINHO
leg_of_the_trip                           BSB/BSB
batch_number                                    0
reimbursement_number                         5506
reimbursement_value                           NaN
applicant_id                                 3036
document_value_int                         -51363
remark_value_int                                0
net_value_int                              -51363
calc_net_value_int                         -51363
diff_net_value                                  0
Name: 37663, dtype: object

The dataset contains multiple records for the same document_number. In this case, they correspond to multiple flight tickets to the same flight, but for distinct passengers.

In [28]:
dataset.loc[dataset['document_number'] == 'Bilhete: VHVK6G',
            ['document_value_int', 'remark_value_int', 'document_number', 'reimbursement_number', 'passenger', 'net_value_int', 'calc_net_value_int']]
Out[28]:
document_value_int remark_value_int document_number reimbursement_number passenger net_value_int calc_net_value_int
37661 73813 0 Bilhete: VHVK6G 5506 CLARISSA GAROTINHO 73813 73813
37662 73813 0 Bilhete: VHVK6G 5506 SAMPAIO/ANACAROLINA 73813 73813
37663 -51363 0 Bilhete: VHVK6G 5506 CLARISSA GAROTINHO -51363 -51363

http://jarbas.datasciencebr.com/#/document_id/5914504

These documents seem to correspond to the same receipt. The reimbursements were claimed in the same batch_number but in distinct reimbursements (reimbursement_number).

Together, document_value, remark_value and net_value make sense.

In [29]:
with_significant_difference[with_significant_difference['document_id'] == '5914504']
Out[29]:
document_id congressperson_name congressperson_id congressperson_document term state party term_id subquota_number subquota_description ... leg_of_the_trip batch_number reimbursement_number reimbursement_value applicant_id document_value_int remark_value_int net_value_int calc_net_value_int diff_net_value
27 5914504 ABEL MESQUITA JR. 178957 1 2015.0 RR DEM 55 8 Security service provided by specialized company ... NaN 1264155 5381 NaN 3074 850000 0 20000 850000 -830000
28 5914504 ABEL MESQUITA JR. 178957 1 2015.0 RR DEM 55 8 Security service provided by specialized company ... NaN 1264155 5386 NaN 3074 850000 0 830000 850000 -20000

2 rows × 34 columns

The Chamber of Deputies just use applicant_id, year and document_id when refering to a reimbursement. If all of them together count as a single reimbursement, an expense is an aggregation of all rows with the same above. attributes.

Let's prove the affirmation above analysing the whole dataset and checking if the values make more sense after.

In [30]:
data_with_id = dataset[(~dataset['document_id'].isnull()) &
                       (~dataset['year'].isnull()) &
                       (~dataset['applicant_id'].isnull())]
In [31]:
keys = ['applicant_id', 'year', 'document_id']
grouped = data_with_id.groupby(keys)
len(grouped)
Out[31]:
1097629
In [32]:
reimbursement_numbers = grouped['reimbursement_number'].agg(lambda x: ','.join(set(x))).reset_index()
agg_net_values_int = grouped['net_value_int'].agg(np.sum).reset_index()
In [33]:
agg_net_values_int.head()
Out[33]:
applicant_id year document_id net_value_int
0 12 2011 1988754 238754
1 12 2011 1988755 211247
2 12 2011 1988756 423550
3 12 2011 1996863 200000
4 12 2011 1996874 200000
In [34]:
agg_data = pd.merge(pd.merge(reimbursement_numbers, agg_net_values_int, on=keys),
                    data_with_id,
                    on=keys,
                    suffixes=('', '_from_original'))
agg_data.head()
Out[34]:
applicant_id year document_id reimbursement_number net_value_int congressperson_name congressperson_id congressperson_document term state ... passenger leg_of_the_trip batch_number reimbursement_number_from_original reimbursement_value document_value_int remark_value_int net_value_int_from_original calc_net_value_int diff_net_value
0 12 2011 1988754 3513 238754 JOSÉ MENDONÇA BEZERRA 74425 149 2007.0 PE ... NaN NaN 513758 3513 NaN 238754 0 238754 238754 0
1 12 2011 1988755 3513 211247 JOSÉ MENDONÇA BEZERRA 74425 149 2007.0 PE ... NaN NaN 513758 3513 NaN 236640 25393 211247 211247 0
2 12 2011 1988756 3513 423550 JOSÉ MENDONÇA BEZERRA 74425 149 2007.0 PE ... NaN NaN 513759 3513 NaN 423550 0 423550 423550 0
3 12 2011 1996863 3535 200000 JOSÉ MENDONÇA BEZERRA 74425 149 2007.0 PE ... NaN NaN 516032 3535 NaN 200000 0 200000 200000 0
4 12 2011 1996874 3535 200000 JOSÉ MENDONÇA BEZERRA 74425 149 2007.0 PE ... NaN NaN 516032 3535 NaN 200000 0 200000 200000 0

5 rows × 36 columns

There are 2,072,559 documents in the datasets, but when considering just the combination of non-empty values for applicant_id, year and document_id, half of that is found. In other words, the CEAP seem to have paid for about 1MM expenses so far.

In [35]:
len(agg_data)
Out[35]:
1100168
In [36]:
agg_data.drop_duplicates(subset=keys, inplace=True)

len(agg_data)
Out[36]:
1097629
In [37]:
agg_data.drop(['reimbursement_number_from_original',
               'net_value_int_from_original'],
              axis=1,
              inplace=True)
In [38]:
agg_data['document_value_int'] = (agg_data['document_value'] * 100.).apply(math.ceil).astype(np.int)
agg_data['remark_value_int'] = (agg_data['remark_value'] * 100.).apply(math.ceil).astype(np.int)
# agg_data['net_value_int'] = (agg_data['net_value'] * 100.).apply(math.ceil).astype(np.int)
agg_data['calc_net_value_int'] = agg_data['document_value_int'] - agg_data['remark_value_int']
agg_data['diff_net_value'] = agg_data['net_value_int'] - agg_data['calc_net_value_int']
In [39]:
with_significant_difference = agg_data.loc[agg_data['diff_net_value'].abs() > 2]

Disconsidering multiple records with this same combination, just 744 remain with a large difference between our own calculation of net_values and the value directly in the original dataset.

In [40]:
len(with_significant_difference)
Out[40]:
744

The majority of records were explained by the previous test, having all the flight ticket issues disappeared (though a few "Flight tickets" remain).

In [41]:
Chart(with_significant_difference).mark_bar().encode(
    x=X('subquota_description:O',
        sort=SortField(field='subquota_description',
                       order='descending',
                       op='count')),
    y='count(*):Q',
)
In [42]:
agg_data[agg_data['reimbursement_number'].str.contains(',')]
Out[42]:
applicant_id year document_id reimbursement_number net_value_int congressperson_name congressperson_id congressperson_document term state ... month installment passenger leg_of_the_trip batch_number reimbursement_value document_value_int remark_value_int calc_net_value_int diff_net_value
326 19 2015 5745646 5105,5106 520001 BENITO GAMA 74535 190 2015.0 BA ... 7 0 NaN NaN 1212128 NaN 520000 0 520000 1
1058 59 2014 5357448 4617,4945,4608,4600 400000 COSTA FERREIRA 74016 571 2011.0 MA ... 3 0 NaN NaN 1096995 NaN 400000 0 400000 0
1594 80 2013 5121244 4293,4296 1225000 PAES LANDIM 74319 118 2015.0 PI ... 5 0 NaN TERESINA/SÃO LUIZ/PARNAIBA/SÃO JOÃO DO PI/SÃO ... 1032094 NaN 1225000 0 1225000 0
1609 80 2013 5160515 4344,4357 1329997 PAES LANDIM 74319 118 2015.0 PI ... 6 0 NaN TERES/BOM JESUS/SJPI/SRN/SJPI/PARNAIBA/CORRENT... 1042476 NaN 1330000 0 1330000 -3
1620 80 2013 5186721 4375,4381 1578098 PAES LANDIM 74319 118 2015.0 PI ... 7 0 NaN TERESINA/PARNAIBA/OEIRAS/BOM JESUS/SRN/FRONTEI... 1049416 NaN 1578100 0 1578100 -2
1635 80 2013 5248632 4435,4431 770000 PAES LANDIM 74319 118 2015.0 PI ... 10 0 NaN TERESINA/SÃO JOÃO DO PI/BOM JESUS/TERESINA 1065779 NaN 770000 0 770000 0
1647 80 2013 5279584 4458,4455 1556004 PAES LANDIM 74319 118 2015.0 PI ... 11 0 NaN NaN 1074447 NaN 1556000 0 1556000 4
1659 80 2013 5292649 4475,4504 70907 PAES LANDIM 74319 118 2015.0 PI ... 12 1 NaN NaN 1078190 NaN 78899 589 78310 -7403
1688 80 2014 5397666 4648,4653 442785 PAES LANDIM 74319 118 2015.0 PI ... 4 0 NaN NaN 1108899 NaN 442786 0 442786 -1
1866 80 2015 5817256 5191,5202 40000 PAES LANDIM 74319 118 2015.0 PI ... 9 0 NaN NaN 1233889 NaN 40000 0 40000 0
1877 80 2015 5849313 5238,5233 195001 PAES LANDIM 74319 118 2015.0 PI ... 10 0 NaN NaN 1243723 NaN 195000 0 195000 1
1885 80 2015 5867480 5256,5339,5325 1530501 PAES LANDIM 74319 118 2015.0 PI ... 11 0 NaN NaN 1249225 NaN 1530500 0 1530500 1
1920 80 2016 5956757 5402,5404 63600 PAES LANDIM 74319 118 2015.0 PI ... 3 0 NaN NaN 1277564 NaN 63600 0 63600 0
2818 98 2013 5309110 4485,4541 1680001 SARNEY FILHO 74210 85 2015.0 MA ... 12 0 NaN NaN 1083289 NaN 1680000 0 1680000 1
2835 98 2014 5318674 4526,4518 500000 SARNEY FILHO 74210 85 2015.0 MA ... 1 0 NaN NaN 1085766 NaN 500000 0 500000 0
5198 116 2012 2453727 4159,4128 450000 ROBERTO FREIRE 73805 546 2015.0 SP ... 12 0 NaN NaN 656049 NaN 450000 0 450000 0
9033 137 2015 5849473 5238,5234 175175 GONZAGA PATRIOTA 74419 143 2015.0 PE ... 11 0 NaN NaN 1243776 NaN 175175 0 175175 0
9051 137 2015 5873233 5256,5325 2950003 GONZAGA PATRIOTA 74419 143 2015.0 PE ... 12 0 NaN NaN 1251199 NaN 2950000 0 2950000 3
10277 169 2013 5205652 4399,4400 36910 SÉRGIO BRITO 73808 213 2015.0 BA ... 9 0 NaN NaN 1056847 NaN 40600 3691 36909 1
10344 169 2013 5242008 4435,4420 650000 SÉRGIO BRITO 73808 213 2015.0 BA ... 8 0 NaN NaN 1063950 NaN 650000 0 650000 0
10377 169 2013 5263424 4458,4443 1000001 SÉRGIO BRITO 73808 213 2015.0 BA ... 11 0 NaN NaN 1069702 NaN 1000000 0 1000000 1
14314 184 2013 5333503 4582,4580 17255 SIMÃO SESSIM 73424 327 2015.0 RJ ... 12 0 NaN NaN 1092547 NaN 17255 0 17255 0
16723 187 2013 5323104 4541,4534 270886 AROLDE DE OLIVEIRA 74833 288 2015.0 RJ ... 12 0 NaN NaN 1086981 NaN 285949 15063 270886 0
17638 188 2012 2451832 4159,4144 345001 BENEDITA DA SILVA 73701 291 2015.0 RJ ... 12 0 NaN NaN 655339 NaN 345000 0 345000 1
18453 188 2014 5585189 4908,4945 310024 BENEDITA DA SILVA 73701 291 2015.0 RJ ... 12 1 NaN NaN 1164535 NaN 534828 123243 411585 -101561
18865 188 2015 5831444 5231,5230,5212 675800 BENEDITA DA SILVA 73701 291 2015.0 RJ ... 10 0 NaN NaN 1238057 NaN 702300 26500 675800 0
20378 191 2015 5637855 4977,4979 135914 BONIFÁCIO DE ANDRADA 74650 222 2015.0 MG ... 3 0 NaN NaN 1177800 NaN 141589 5675 135914 0
20627 191 2015 5878495 5284,5359,5339 36208 BONIFÁCIO DE ANDRADA 74650 222 2015.0 MG ... 11 0 NaN NaN 1252881 NaN 68365 0 68365 -32157
23799 198 2016 6038440 5491,5492 800000 JUTAHY JUNIOR 74570 206 2015.0 BA ... 7 0 NaN NaN 1303372 NaN 800000 0 800000 0
24570 257 2012 5000679 4159,4152,4195 289380 LAEL VARELLA 74742 245 2011.0 MG ... 12 1 NaN NaN 1000095 NaN 300000 0 300000 -10620
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1086713 3073 2015 5876013 5319,5296,5325 222001 JOÃO DANIEL 178970 179 2015.0 SE ... 11 0 NaN NaN 1251955 NaN 222000 0 222000 1
1086719 3073 2015 5914582 5356,5346 150000 JOÃO DANIEL 178970 179 2015.0 SE ... 12 0 NaN NaN 1264190 NaN 150000 0 150000 0
1086745 3073 2016 5955226 5399,5397 900000 JOÃO DANIEL 178970 179 2015.0 SE ... 3 0 NaN NaN 1277080 NaN 900000 0 900000 0
1086907 3074 2016 5914504 5381,5386 850000 ABEL MESQUITA JR. 178957 1 2015.0 RR ... 1 0 NaN NaN 1264155 NaN 850000 0 850000 0
1087195 3075 2015 5726880 5087,5106 1500004 CABO DACIOLO 178938 289 2015.0 RJ ... 6 0 NaN NaN 1206279 NaN 1500000 0 1500000 4
1087359 3075 2015 5807618 5202,5185 1500004 CABO DACIOLO 178938 289 2015.0 RJ ... 9 0 NaN NaN 1231443 NaN 1500000 0 1500000 4
1087374 3075 2015 5818120 5238,5214 269093 CABO DACIOLO 178938 289 2015.0 RJ ... 8 0 NaN NaN 1234241 NaN 269793 700 269093 0
1087463 3075 2015 5880780 5359,5387,5294,5356,5400,5364,5339,5325 751402 CABO DACIOLO 178938 289 2015.0 RJ ... 12 1 NaN NaN 1253719 NaN 1500000 0 1500000 -748598
1088352 3077 2016 5982353 5435,5439 174966 TENENTE LÚCIO 178898 266 2015.0 MG ... 4 0 NaN NaN 1285702 NaN 174966 0 174966 0
1088603 3078 2015 5883832 5301,5280,5319,5391,5339,5325 1816840 TIA ERON 178862 217 2015.0 BA ... 12 1 NaN NaN 1254560 NaN 1980000 0 1980000 -163160
1088699 3078 2016 6044497 5503,5501 269375 TIA ERON 178862 217 2015.0 BA ... 6 0 NaN NaN 1305593 NaN 271972 2597 269375 0
1089427 3081 2015 5909521 5334,5339 800000 RENATA ABREU 178989 330 2015.0 SP ... 12 0 NaN NaN 1262455 NaN 800000 0 800000 0
1092885 3085 2015 5888879 5319,5310,5325 220000 ANA PERUGINI 178973 332 2015.0 SP ... 11 0 NaN NaN 1256102 NaN 220000 0 220000 0
1094314 3089 2015 5786501 5172,5168 200000 DAVIDSON MAGALHÃES 101309 525 2015.0 BA ... 8 0 NaN NaN 1224634 NaN 200000 0 200000 0
1094398 3089 2015 5836550 5238,5235 170000 DAVIDSON MAGALHÃES 101309 525 2015.0 BA ... 10 0 NaN NaN 1239746 NaN 170000 0 170000 0
1094449 3089 2015 5877799 5356,5263 53724 DAVIDSON MAGALHÃES 101309 525 2015.0 BA ... 11 0 NaN salvador/brasilia 1253117 NaN 53724 0 53724 0
1094666 3089 2016 5994951 5459,5454 105002 DAVIDSON MAGALHÃES 101309 525 2015.0 BA ... 4 0 NaN NaN 1290083 NaN 105000 0 105000 2
1094720 3089 2016 6025536 5488,5474 113666 DAVIDSON MAGALHÃES 101309 525 2015.0 BA ... 5 0 NaN NaN 1299751 NaN 113667 0 113667 -1
1095353 3090 2016 6002707 5451,5459 700000 FERNANDO MONTEIRO 92699 529 2015.0 PE ... 4 0 NaN NaN 1292326 NaN 700000 0 700000 0
1095362 3090 2016 6013860 5473,5488 500001 FERNANDO MONTEIRO 92699 529 2015.0 PE ... 5 0 NaN NaN 1296046 NaN 500000 0 500000 1
1095439 3090 2016 6048224 5511,5498 42268 FERNANDO MONTEIRO 92699 529 2015.0 PE ... 6 0 RAFAEL GUILHERME CAETANO SANTOS REC/PNZ 1306716 NaN 42268 0 42268 0
1096009 3092 2015 5888003 5450,5339,5359,5330,5356,5364,5387 826296 MARQUINHO MENDES 181315 536 2015.0 RJ ... 12 1 NaN NaN 1258436 NaN 890000 20000 870000 -43704
1097777 3118 2015 5884395 5319,5310,5339,5325 61026 FRANKLIN LIMA 186775 544 2015.0 MG ... 11 1 NaN NaN 1254731 NaN 161020 0 161020 -99994
1097968 3118 2016 6022394 5488,5474 3475003 FRANKLIN LIMA 186775 544 2015.0 MG ... 5 0 NaN NaN 1298709 NaN 3475000 0 3475000 3
1098219 3119 2015 5878800 5284,5325 600000 SILAS FREIRE 187008 545 2015.0 PI ... 11 0 NaN NaN 1253003 NaN 600000 0 600000 0
1099002 3134 2016 5953208 5402,5397 1000000 FLAVIO NOGUEIRA 191923 556 2015.0 PI ... 3 0 NaN NaN 1276432 NaN 1000000 0 1000000 0
1099007 3134 2016 5980516 5465,5442,5430,5460,5433 727292 FLAVIO NOGUEIRA 191923 556 2015.0 PI ... 4 1 NaN NaN 1285214 NaN 750000 0 750000 -22708
1099246 3136 2015 5888535 5319,5310,5325 989000 ANGELA ALBINO 192120 559 2015.0 SC ... 12 0 NaN NaN 1255991 NaN 989000 0 989000 0
1099779 3145 2016 6024926 5488,5477 1390001 NIVALDO ALBUQUERQUE 194260 576 2015.0 AL ... 6 0 NaN NaN 1299549 NaN 1390000 0 1390000 1
1099785 3145 2016 6053859 5507,5511 1799999 NIVALDO ALBUQUERQUE 194260 576 2015.0 AL ... 7 0 NaN NaN 1308505 NaN 1800000 0 1800000 -1

1200 rows × 34 columns

The document with major positive difference in the dataset doesn't seem to have any irregularity. The deputy asked the reimbursement for R\$ 25.000,00, but received just R\$ 6.376,11.

In [43]:
with_significant_difference.sort_values('diff_net_value').iloc[0]
Out[43]:
applicant_id                                                 3031
year                                                         2015
document_id                                               5923763
reimbursement_number                                    5366,5387
net_value_int                                              637611
congressperson_name                                     ZÉ CARLOS
congressperson_id                                          178889
congressperson_document                                        86
term                                                         2015
state                                                          MA
party                                                          PT
term_id                                                        55
subquota_number                                                 5
subquota_description          Publicity of parliamentary activity
subquota_group_id                                               0
subquota_group_description                                    NaN
supplier                      TVT PRODUTORA DE AUDIO E VIDEO LTDA
cnpj_cpf                                           09547438000169
document_number                                           0000601
document_type                                                   0
issue_date                                    2015-12-23 00:00:00
document_value                                              25000
remark_value                                                    0
net_value                                                 6242.95
month                                                          12
installment                                                     1
passenger                                                     NaN
leg_of_the_trip                                               NaN
batch_number                                              1267115
reimbursement_value                                           NaN
document_value_int                                        2500000
remark_value_int                                                0
calc_net_value_int                                        2500000
diff_net_value                                           -1862389
Name: 1059230, dtype: object

Deputies receiving less money than they asked is OK. What if they are receiving more than they should (values in the document_value column)?

In [44]:
has_extra_reimbursement = with_significant_difference['diff_net_value'] > 0
extra_reimbursement = with_significant_difference[has_extra_reimbursement]. \
    sort_values('diff_net_value', ascending=False)
len(extra_reimbursement)
Out[44]:
95
In [45]:
extra_reimbursement.head()
Out[45]:
applicant_id year document_id reimbursement_number net_value_int congressperson_name congressperson_id congressperson_document term state ... month installment passenger leg_of_the_trip batch_number reimbursement_value document_value_int remark_value_int calc_net_value_int diff_net_value
982176 2899 2016 5951638 5396,5399 793763 ODORICO MONTEIRO 178868 107 2015.0 CE ... 2 0 NaN NaN 1275947 NaN 490000 18896 471104 322659
372214 1793 2015 5767189 5152 329886 LUIZ CARLOS BUSATO 141485 488 2015.0 RS ... 7 0 NaN NaN 1218862 NaN 345500 319502 25998 303888
207106 1411 2012 2450220 4140 226929 RICARDO BERZOINI 74793 390 2011.0 SP ... 12 1 NaN NaN 654789 NaN 384896 451754 -66858 293787
748508 2331 2015 5756825 5132,5156 566739 WASHINGTON REIS 160620 326 2015.0 RJ ... 7 0 NaN NaN 1215673 NaN 315851 0 315851 250888
1010313 2949 2016 5983086 5436 650000 JUNIOR MARRECA 178885 78 2015.0 MA ... 4 1 NaN NaN 1285906 NaN 650000 210000 440000 210000

5 rows × 34 columns

In [46]:
Chart(extra_reimbursement).mark_bar().encode(
    x=X('subquota_description:O',
        sort=SortField(field='subquota_description',
                       order='descending',
                       op='count')),
    y='count(*):Q',
)
In [47]:
extra_reimbursement.sort_values('diff_net_value', ascending=False).iloc[0]
Out[47]:
applicant_id                                      2899
year                                              2016
document_id                                    5951638
reimbursement_number                         5396,5399
net_value_int                                   793763
congressperson_name                   ODORICO MONTEIRO
congressperson_id                               178868
congressperson_document                            107
term                                              2015
state                                               CE
party                                             PROS
term_id                                             55
subquota_number                                      3
subquota_description              Fuels and lubricants
subquota_group_id                                    1
subquota_group_description        Veículos Automotores
supplier                      MIGUEL PETROLEO COMERCIO
cnpj_cpf                                09027878000195
document_number                                    403
document_type                                        0
issue_date                         2016-03-22 23:00:00
document_value                                    4900
remark_value                                    188.96
net_value                                      4354.66
month                                                2
installment                                          0
passenger                                          NaN
leg_of_the_trip                                    NaN
batch_number                                   1275947
reimbursement_value                                NaN
document_value_int                              490000
remark_value_int                                 18896
calc_net_value_int                              471104
diff_net_value                                  322659
Name: 982176, dtype: object

This document says that a deputy received R\$ 3.226,59 more than it should. Let's check all the datasets we have to see if there's any mistake in the calculation.

In [48]:
dataset[dataset['document_id'] == '5951638']
Out[48]:
document_id congressperson_name congressperson_id congressperson_document term state party term_id subquota_number subquota_description ... leg_of_the_trip batch_number reimbursement_number reimbursement_value applicant_id document_value_int remark_value_int net_value_int calc_net_value_int diff_net_value
134863 5951638 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1275947 5396 NaN 2899 490000 18896 435466 471104 -35638
134864 5951638 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1275947 5399 NaN 2899 490000 18896 358297 471104 -112807

2 rows × 34 columns

In [49]:
data_with_id[data_with_id['document_id'] == '5951638']
Out[49]:
document_id congressperson_name congressperson_id congressperson_document term state party term_id subquota_number subquota_description ... leg_of_the_trip batch_number reimbursement_number reimbursement_value applicant_id document_value_int remark_value_int net_value_int calc_net_value_int diff_net_value
134863 5951638 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1275947 5396 NaN 2899 490000 18896 435466 471104 -35638
134864 5951638 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1275947 5399 NaN 2899 490000 18896 358297 471104 -112807

2 rows × 34 columns

In [50]:
extra_reimbursement[extra_reimbursement['document_id'] == '5951638']
Out[50]:
applicant_id year document_id reimbursement_number net_value_int congressperson_name congressperson_id congressperson_document term state ... month installment passenger leg_of_the_trip batch_number reimbursement_value document_value_int remark_value_int calc_net_value_int diff_net_value
982176 2899 2016 5951638 5396,5399 793763 ODORICO MONTEIRO 178868 107 2015.0 CE ... 2 0 NaN NaN 1275947 NaN 490000 18896 471104 322659

1 rows × 34 columns

In [51]:
extra_reimbursement[extra_reimbursement['document_id'] == '5951638'].iloc[0]
Out[51]:
applicant_id                                      2899
year                                              2016
document_id                                    5951638
reimbursement_number                         5396,5399
net_value_int                                   793763
congressperson_name                   ODORICO MONTEIRO
congressperson_id                               178868
congressperson_document                            107
term                                              2015
state                                               CE
party                                             PROS
term_id                                             55
subquota_number                                      3
subquota_description              Fuels and lubricants
subquota_group_id                                    1
subquota_group_description        Veículos Automotores
supplier                      MIGUEL PETROLEO COMERCIO
cnpj_cpf                                09027878000195
document_number                                    403
document_type                                        0
issue_date                         2016-03-22 23:00:00
document_value                                    4900
remark_value                                    188.96
net_value                                      4354.66
month                                                2
installment                                          0
passenger                                          NaN
leg_of_the_trip                                    NaN
batch_number                                   1275947
reimbursement_value                                NaN
document_value_int                              490000
remark_value_int                                 18896
calc_net_value_int                              471104
diff_net_value                                  322659
Name: 982176, dtype: object

Doesn't seem to have. Calculations are doing what it's expected and are following our understanding of the dataset.

There's a possibility that this extra payment counted towards other requests for reimbursements. Let's check.

In [52]:
dataset[(dataset['applicant_id'] == 2899) & \
        (dataset['subquota_number'] == 3) & \
        (dataset['year'] == 2016) & \
        (dataset['month'].isin([1, 2, 3]))]
Out[52]:
document_id congressperson_name congressperson_id congressperson_document term state party term_id subquota_number subquota_description ... leg_of_the_trip batch_number reimbursement_number reimbursement_value applicant_id document_value_int remark_value_int net_value_int calc_net_value_int diff_net_value
134842 5936100 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1271344 5381 NaN 2899 2400 0 2400 2400 0
134844 5941750 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1274395 5392 NaN 2899 5000 0 5000 5000 0
134845 5911608 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1263204 5345 NaN 2899 10000 0 10000 10000 0
134847 5910002 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1263203 5344 NaN 2899 5000 0 5000 5000 0
134848 5915078 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1265871 5354 NaN 2899 26121 0 26121 26121 0
134849 5918158 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1265871 5354 NaN 2899 24176 0 24176 24176 0
134850 5923660 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1268782 5365 NaN 2899 24406 0 24406 24406 0
134851 5928513 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1268783 5365 NaN 2899 23682 0 23682 23682 0
134852 5919933 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1265871 5354 NaN 2899 4000 0 4000 4000 0
134853 5919956 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1265871 5354 NaN 2899 5000 0 5000 5000 0
134854 5919964 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1265866 5354 NaN 2899 7000 0 7000 7000 0
134855 5919961 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1265866 5354 NaN 2899 7000 0 7000 7000 0
134856 5919966 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1265866 5354 NaN 2899 7000 0 7000 7000 0
134857 5919959 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1265866 5354 NaN 2899 7000 0 7000 7000 0
134858 5943742 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1274395 5392 NaN 2899 5000 0 5000 5000 0
134859 5936081 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1271344 5381 NaN 2899 19318 0 19318 19318 0
134861 5911741 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1263220 5345 NaN 2899 358297 0 358297 358297 0
134863 5951638 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1275947 5396 NaN 2899 490000 18896 435466 471104 -35638
134864 5951638 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1275947 5399 NaN 2899 490000 18896 358297 471104 -112807
134866 5976720 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1283945 5439 NaN 2899 490000 105102 488832 384898 103934
134870 5956477 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1277485 5404 NaN 2899 24796 0 24796 24796 0
134872 5941716 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1274395 5392 NaN 2899 25589 0 25589 25589 0
134873 5956500 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1277485 5404 NaN 2899 25773 0 25773 25773 0
134879 5946610 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1274395 5392 NaN 2899 22890 0 22890 22890 0
134888 5919898 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1265871 5354 NaN 2899 25193 0 25193 25193 0
134889 5931192 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1271344 5381 NaN 2899 23297 0 23297 23297 0
134890 5910093 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1262682 5342 NaN 2899 10000 0 10000 10000 0
134891 5947738 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1275323 5393 NaN 2899 17558 0 17558 17558 0
134892 5947718 ODORICO MONTEIRO 178868 107 2015.0 CE PROS 55 3 Fuels and lubricants ... NaN 1275323 5393 NaN 2899 19800 0 19800 19800 0

29 rows × 34 columns

Can't see anything proving that. Going to analize the next reimbursement.

In [53]:
extra_reimbursement.sort_values('diff_net_value', ascending=False).iloc[1]
Out[53]:
applicant_id                                  1793
year                                          2015
document_id                                5767189
reimbursement_number                          5152
net_value_int                               329886
congressperson_name             LUIZ CARLOS BUSATO
congressperson_id                           141485
congressperson_document                        488
term                                          2015
state                                           RS
party                                          PTB
term_id                                         55
subquota_number                                  3
subquota_description          Fuels and lubricants
subquota_group_id                                1
subquota_group_description    Veículos Automotores
supplier                             METROPOLITANO
cnpj_cpf                            88587589000117
document_number                               1047
document_type                                    0
issue_date                     2015-08-11 00:00:00
document_value                                3455
remark_value                               3195.02
net_value                                  3298.86
month                                            7
installment                                      0
passenger                                      NaN
leg_of_the_trip                                NaN
batch_number                               1218862
reimbursement_value                            NaN
document_value_int                          345500
remark_value_int                            319502
calc_net_value_int                           25998
diff_net_value                              303888
Name: 372214, dtype: object
In [54]:
(extra_reimbursement['remark_value'] > 0).sum()
Out[54]:
55

Nope. Not a clue.

In [55]:
extra_reimbursement.sort_values('diff_net_value', ascending=False).iloc[2]
Out[55]:
applicant_id                                                              1411
year                                                                      2012
document_id                                                            2450220
reimbursement_number                                                      4140
net_value_int                                                           226929
congressperson_name                                           RICARDO BERZOINI
congressperson_id                                                        74793
congressperson_document                                                    390
term                                                                      2011
state                                                                       SP
party                                                                       PT
term_id                                                                     54
subquota_number                                                              8
subquota_description          Security service provided by specialized company
subquota_group_id                                                            0
subquota_group_description                                                 NaN
supplier                                      CASO /SISTEMAS DE SEGURANÇA LTDA
cnpj_cpf                                                        05697868000106
document_number                                                            854
document_type                                                                0
issue_date                                                 2012-12-13 00:00:00
document_value                                                         3848.96
remark_value                                                           4517.54
net_value                                                              2269.29
month                                                                       12
installment                                                                  1
passenger                                                                  NaN
leg_of_the_trip                                                            NaN
batch_number                                                            654789
reimbursement_value                                                        NaN
document_value_int                                                      384896
remark_value_int                                                        451754
calc_net_value_int                                                      -66858
diff_net_value                                                          293787
Name: 207106, dtype: object

Not yet.

In [56]:
extra_reimbursement.sort_values('diff_net_value', ascending=False).iloc[3]
Out[56]:
applicant_id                                             2331
year                                                     2015
document_id                                           5756825
reimbursement_number                                5132,5156
net_value_int                                          566739
congressperson_name                           WASHINGTON REIS
congressperson_id                                      160620
congressperson_document                                   326
term                                                     2015
state                                                      RJ
party                                                    PMDB
term_id                                                    55
subquota_number                                             3
subquota_description                     Fuels and lubricants
subquota_group_id                                           1
subquota_group_description               Veículos Automotores
supplier                      AUTO POSTO PARADA CERTA UM LTDA
cnpj_cpf                                       09036204000157
document_number                                           804
document_type                                               0
issue_date                                2015-08-04 00:00:00
document_value                                        3158.51
remark_value                                                0
net_value                                             3158.51
month                                                       7
installment                                                 0
passenger                                                 NaN
leg_of_the_trip                                           NaN
batch_number                                          1215673
reimbursement_value                                       NaN
document_value_int                                     315851
remark_value_int                                            0
calc_net_value_int                                     315851
diff_net_value                                         250888
Name: 748508, dtype: object
In [ ]: